{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "# Set before malaya is imported in the next cell; presumably hides CUDA devices so the run stays on CPU -- TODO confirm.\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import malaya\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/crawl/karangan.net/karangan.net.json\n",
    "# !wget https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/crawl/ipendidikan/nested-ipendidikan.json\n",
    "# !wget https://raw.githubusercontent.com/huseinzol05/Malay-Dataset/master/summarization/karangan/1.rtf\n",
    "# !wget https://raw.githubusercontent.com/huseinzol05/Malay-Dataset/master/summarization/karangan/2.rtf\n",
    "# !pip3 install striprtf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from striprtf.striprtf import rtf_to_text\n",
    "\n",
    "# Read the raw RTF markup; it is converted to plain text below.\n",
    "with open('1.rtf') as fopen:\n",
    "    x = fopen.read()\n",
    "\n",
    "# The converted text is cut on '===' and consumed two segments at a time; the loop in a\n",
    "# later cell reads row[0] for the point lines and row[1] for the essay text.\n",
    "splitted = rtf_to_text(x,errors=\"ignore\").split('===')\n",
    "\n",
    "# Stop at len - 1 so a dangling last segment (odd segment count) is dropped instead of\n",
    "# producing a one-element row that would raise IndexError on row[1] downstream.\n",
    "results = [splitted[i: i + 2] for i in range(0, len(splitted) - 1, 2)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import json\n",
    "from unidecode import unidecode\n",
    "\n",
    "# Bracketed marker tokens that cleaning_rtf() deletes from the RTF-derived text.\n",
    "remove = ['(AR5)', '(AR1)', '(AR2)', '(AR3)', '(AR4)', '(AA)', '(AH1)', '(AH2)', '(AH3)', '(AC)',\n",
    "'(AH4)', '(AP)', '(A Ksmpln)', '(A Cdngn)', '(A Pndpt)', '(A Pntp)', '(AJ)', '(AH5)']\n",
    "\n",
    "def cleaning(string):\n",
    "    \"\"\"Pass `string` through unidecode, squeeze each run of spaces to one space, and strip the ends.\"\"\"\n",
    "    transliterated = unidecode(string)\n",
    "    squeezed = re.sub(r'[ ]+', ' ', transliterated)\n",
    "    return squeezed.strip()\n",
    "\n",
    "def cleaning_rtf(string):\n",
    "    \"\"\"Delete every token listed in `remove` from `string`, then normalise it with `cleaning`.\"\"\"\n",
    "    text = string\n",
    "    for marker in remove:\n",
    "        text = text.replace(marker, '')\n",
    "    return cleaning(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parallel accumulators filled by the following cells: before[i] is a list of short point strings,\n",
    "# after[i] is the cleaned essay text paired with it.\n",
    "before, after = [], []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "for pair in results:\n",
    "    # pair[0] holds newline-separated candidate points, pair[1] the essay text.\n",
    "    points = [\n",
    "        line for line in pair[0].split('\\n')\n",
    "        if len(line) > 2 and line.count(' ') > 2 and len(line.split()) < 15\n",
    "    ]\n",
    "    essay = cleaning_rtf(pair[1])\n",
    "\n",
    "    # Skip pairs with fewer than 3 usable points or an essay of 10 characters or less.\n",
    "    if len(points) <= 2 or len(essay) <= 10:\n",
    "        continue\n",
    "\n",
    "    before.append(points)\n",
    "    after.append(essay)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('karangan.net.json') as fopen:\n",
    "    karangan = json.load(fopen)\n",
    "\n",
    "for entry in karangan:\n",
    "    # entry['h'] supplies the candidate points; the strings in entry['p'] are joined into the essay.\n",
    "    points = [h for h in entry['h'] if h.count(' ') > 2 and len(h.split()) < 15]\n",
    "    essay = cleaning(' '.join(entry['p']))\n",
    "\n",
    "    # Skip entries with fewer than 3 usable points or an essay of 10 characters or less.\n",
    "    if len(points) <= 2 or len(essay) <= 10:\n",
    "        continue\n",
    "\n",
    "    before.append(points)\n",
    "    after.append(essay)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('nested-ipendidikan.json') as fopen:\n",
    "    karangan = json.load(fopen)\n",
    "\n",
    "for entry in karangan:\n",
    "    # Candidate points come from entry['li'] with its first two items skipped;\n",
    "    # the strings in entry['p'] are joined into the essay.\n",
    "    points = [li for li in entry['li'][2:] if li.count(' ') > 2 and len(li.split()) < 15]\n",
    "    essay = cleaning(' '.join(entry['p']))\n",
    "\n",
    "    # Skip entries with fewer than 3 usable points or an essay of 10 characters or less.\n",
    "    if len(points) <= 2 or len(essay) <= 10:\n",
    "        continue\n",
    "\n",
    "    before.append(points)\n",
    "    after.append(essay)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "367"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of (points, essay) pairs collected so far.\n",
    "len(before)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from unidecode import unidecode\n",
    "import re\n",
    "\n",
    "def simple_cleaning(string):\n",
    "    \"\"\"Pass `string` through unidecode, turn newlines into spaces, squeeze space runs, and strip the ends.\"\"\"\n",
    "    single_line = unidecode(string).replace('\\n', ' ')\n",
    "    return re.sub(r'[ ]+', ' ', single_line).strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "stopwords = \"\"\"\n",
    "ada\n",
    "adakah\n",
    "adakan\n",
    "adalah\n",
    "adanya\n",
    "adapun\n",
    "agak\n",
    "agar\n",
    "akan\n",
    "aku\n",
    "akulah\n",
    "akupun\n",
    "al\n",
    "alangkah\n",
    "allah\n",
    "amat\n",
    "antara\n",
    "antaramu\n",
    "antaranya\n",
    "apa\n",
    "apa-apa\n",
    "apabila\n",
    "apakah\n",
    "apapun\n",
    "atas\n",
    "atasmu\n",
    "atasnya\n",
    "atau\n",
    "ataukah\n",
    "ataupun\n",
    "bagaimana\n",
    "bagaimanakah\n",
    "bagi\n",
    "bagimu\n",
    "baginya\n",
    "bahawa\n",
    "bahawasanya\n",
    "bahkan\n",
    "bahwa\n",
    "banyak\n",
    "banyaknya\n",
    "barangsiapa\n",
    "bawah\n",
    "beberapa\n",
    "begitu\n",
    "begitupun\n",
    "belaka\n",
    "belum\n",
    "belumkah\n",
    "berada\n",
    "berapa\n",
    "berikan\n",
    "beriman\n",
    "berkenaan\n",
    "berupa\n",
    "beserta\n",
    "biarpun\n",
    "bila\n",
    "bilakah\n",
    "bilamana\n",
    "bisa\n",
    "boleh\n",
    "bukan\n",
    "bukankah\n",
    "bukanlah\n",
    "dahulu\n",
    "dalam\n",
    "dalamnya\n",
    "dan\n",
    "dapat\n",
    "dapati\n",
    "dapatkah\n",
    "dapatlah\n",
    "dari\n",
    "daripada\n",
    "daripadaku\n",
    "daripadamu\n",
    "daripadanya\n",
    "demi\n",
    "demikian\n",
    "demikianlah\n",
    "dengan\n",
    "dengannya\n",
    "di\n",
    "dia\n",
    "dialah\n",
    "didapat\n",
    "didapati\n",
    "dimanakah\n",
    "engkau\n",
    "engkaukah\n",
    "engkaulah\n",
    "engkaupun\n",
    "hai\n",
    "hampir\n",
    "hampir-hampir\n",
    "hanya\n",
    "hanyalah\n",
    "hendak\n",
    "hendaklah\n",
    "hingga\n",
    "ia\n",
    "iaitu\n",
    "ialah\n",
    "ianya\n",
    "inginkah\n",
    "ini\n",
    "inikah\n",
    "inilah\n",
    "itu\n",
    "itukah\n",
    "itulah\n",
    "jadi\n",
    "jangan\n",
    "janganlah\n",
    "jika\n",
    "jikalau\n",
    "jua\n",
    "juapun\n",
    "juga\n",
    "kalau\n",
    "kami\n",
    "kamikah\n",
    "kamipun\n",
    "kamu\n",
    "kamukah\n",
    "kamupun\n",
    "katakan\n",
    "ke\n",
    "kecuali\n",
    "kelak\n",
    "kembali\n",
    "kemudian\n",
    "kepada\n",
    "kepadaku\n",
    "kepadakulah\n",
    "kepadamu\n",
    "kepadanya\n",
    "kepadanyalah\n",
    "kerana\n",
    "kerananya\n",
    "kesan\n",
    "ketika\n",
    "kini\n",
    "kita\n",
    "ku\n",
    "kurang\n",
    "lagi\n",
    "lain\n",
    "lalu\n",
    "lamanya\n",
    "langsung\n",
    "lebih\n",
    "maha\n",
    "mahu\n",
    "mahukah\n",
    "mahupun\n",
    "maka\n",
    "malah\n",
    "mana\n",
    "manakah\n",
    "manapun\n",
    "masih\n",
    "masing\n",
    "masing-masing\n",
    "melainkan\n",
    "memang\n",
    "mempunyai\n",
    "mendapat\n",
    "mendapati\n",
    "mendapatkan\n",
    "mengadakan\n",
    "mengapa\n",
    "mengapakah\n",
    "mengenai\n",
    "menjadi\n",
    "menyebabkan\n",
    "menyebabkannya\n",
    "mereka\n",
    "merekalah\n",
    "merekapun\n",
    "meskipun\n",
    "mu\n",
    "nescaya\n",
    "niscaya\n",
    "nya\n",
    "olah\n",
    "oleh\n",
    "orang\n",
    "pada\n",
    "padahal\n",
    "padamu\n",
    "padanya\n",
    "paling\n",
    "para\n",
    "pasti\n",
    "patut\n",
    "patutkah\n",
    "per\n",
    "pergilah\n",
    "perkara\n",
    "perkaranya\n",
    "perlu\n",
    "pernah\n",
    "pertama\n",
    "pula\n",
    "pun\n",
    "sahaja\n",
    "saja\n",
    "saling\n",
    "sama\n",
    "sama-sama\n",
    "samakah\n",
    "sambil\n",
    "sampai\n",
    "sana\n",
    "sangat\n",
    "sangatlah\n",
    "saya\n",
    "se\n",
    "seandainya\n",
    "sebab\n",
    "sebagai\n",
    "sebagaimana\n",
    "sebanyak\n",
    "sebelum\n",
    "sebelummu\n",
    "sebelumnya\n",
    "sebenarnya\n",
    "secara\n",
    "sedang\n",
    "sedangkan\n",
    "sedikit\n",
    "sedikitpun\n",
    "segala\n",
    "sehingga\n",
    "sejak\n",
    "sekalian\n",
    "sekalipun\n",
    "sekarang\n",
    "sekitar\n",
    "selain\n",
    "selalu\n",
    "selama\n",
    "selama-lamanya\n",
    "seluruh\n",
    "seluruhnya\n",
    "sementara\n",
    "semua\n",
    "semuanya\n",
    "semula\n",
    "senantiasa\n",
    "sendiri\n",
    "sentiasa\n",
    "seolah\n",
    "seolah-olah\n",
    "seorangpun\n",
    "separuh\n",
    "sepatutnya\n",
    "seperti\n",
    "seraya\n",
    "sering\n",
    "serta\n",
    "seseorang\n",
    "sesiapa\n",
    "sesuatu\n",
    "sesudah\n",
    "sesudahnya\n",
    "sesungguhnya\n",
    "sesungguhnyakah\n",
    "setelah\n",
    "setiap\n",
    "siapa\n",
    "siapakah\n",
    "sini\n",
    "situ\n",
    "situlah\n",
    "suatu\n",
    "sudah\n",
    "sudahkah\n",
    "sungguh\n",
    "sungguhpun\n",
    "supaya\n",
    "tadinya\n",
    "tahukah\n",
    "tak\n",
    "tanpa\n",
    "tanya\n",
    "tanyakanlah\n",
    "tapi\n",
    "telah\n",
    "tentang\n",
    "tentu\n",
    "terdapat\n",
    "terhadap\n",
    "terhadapmu\n",
    "termasuk\n",
    "terpaksa\n",
    "tertentu\n",
    "tetapi\n",
    "tiada\n",
    "tiadakah\n",
    "tiadalah\n",
    "tiap\n",
    "tiap-tiap\n",
    "tidak\n",
    "tidakkah\n",
    "tidaklah\n",
    "turut\n",
    "untuk\n",
    "untukmu\n",
    "wahai\n",
    "walau\n",
    "walaupun\n",
    "ya\n",
    "yaini\n",
    "yaitu\n",
    "yakni\n",
    "yang\n",
    "\"\"\"\n",
    "# Turn the newline-separated block above into a list, dropping empty pieces.\n",
    "stopwords = [word for word in stopwords.split('\\n') if word]\n",
    "# A much shorter, hand-picked list.\n",
    "stopwords2 = ['saya', 'awak', 'yang', 'kamu', 'mereka', 'kita', 'ini', 'juga', 'dengan']\n",
    "# stopwords2 extended with seven more words.\n",
    "stopwords3 = [*stopwords2, 'akan', 'sebagai', 'di', 'dalam', 'atau', 'ke', 'jika']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from malaya.text.vectorizer import SkipGramTfidfVectorizer\n",
    "\n",
    "# Vectorizers built with default arguments; `tfidf` is passed as `model` to textrank in later cells.\n",
    "tfidf = TfidfVectorizer()\n",
    "# NOTE(review): `skip` is not referenced by any other visible cell.\n",
    "skip = SkipGramTfidfVectorizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 24.8 ms, sys: 0 ns, total: 24.8 ms\n",
      "Wall time: 22.4 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['mendadak bagai cendawam tumbuh selepas hujan',\n",
       " 'sikap ibu bapa merupakan punca utama',\n",
       " 'berganding bahu bagai aur',\n",
       " 'seekor kerbau membawa lumpur',\n",
       " 'angin masakan pokok bergoyang',\n",
       " 'kalangan generasi pelapis negara',\n",
       " 'bapa borek anak rintik',\n",
       " 'garang suka memarahi anak',\n",
       " 'ibu bapa memarahi anak',\n",
       " 'anak-anak lantara kesibukan bekerja',\n",
       " 'masalah disiplin semakin meningkat',\n",
       " 'disiplin pelajar semakin membimbangkan']"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# rake candidates for the essay at index 1, using the full stopword list.\n",
    "ranked = malaya.keyword.extractive.rake(after[1], top_k = 100, stopwords = stopwords)\n",
    "# Keep phrases (element [1] of each candidate) with more than 3 words and more than 10 characters.\n",
    "phrases = [\n",
    "    simple_cleaning(item[1])\n",
    "    for item in ranked\n",
    "    if len(item[1].split()) > 3 and len(item[1]) > 10\n",
    "]\n",
    "# Show a prefix whose length is drawn from random.randint(10, 15).\n",
    "keywords_rake = phrases[:random.randint(10, 15)]\n",
    "keywords_rake"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 13.5 ms, sys: 479 µs, total: 13.9 ms\n",
      "Wall time: 11.7 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['terjebak dalam masalah disiplin semakin meningkat dari hari ke hari secara mendadak bagai cendawam tumbuh selepas hujan',\n",
       " 'sering disiarkan di media massa sama ada di media cetak atau di media elektronik',\n",
       " 'akan menjadi budaya dan popular dalam kalangan generasi pelapis negara pada masa akan datang',\n",
       " 'tidak mengambil kisah sama ada anak-anak berada di rumah atau di lur rumah',\n",
       " 'kehausan kasih sayang akan bertindak ganas atau kasar untuk melepaskan kemarahan atau ketidakpuashatiannya',\n",
       " 'akan belajar untuk bergaduh dan bercakap kasar di sekolah apabila diganggu oleh pelajar lain seperti',\n",
       " 'tidak akan melakukan perkara tidak baik di sekolah kerana bapa borek anak rintik',\n",
       " 'terlibat dalam masalah disiplin kerana jikalau tiada angin masakan pokok bergoyang',\n",
       " 'boleh melakukan apa-apa saja jika ibu bapa tidak berada di sisi',\n",
       " 'masalah disiplin dalam kalangan pelajar perlu dibendung seperti kata pepatah Melayu',\n",
       " 'mestilah ingat bahawa disebabkan nila setitik maka rosaklah susu belanga']"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# rake candidates for the essay at index 1, using the short stopwords2 list.\n",
    "ranked = malaya.keyword.extractive.rake(after[1], top_k = 100, stopwords = stopwords2)\n",
    "# Keep phrases (element [1] of each candidate) with more than 3 words and more than 10 characters.\n",
    "phrases = [\n",
    "    simple_cleaning(item[1])\n",
    "    for item in ranked\n",
    "    if len(item[1].split()) > 3 and len(item[1]) > 10\n",
    "]\n",
    "# Show a prefix whose length is drawn from random.randint(10, 15).\n",
    "keywords_rake = phrases[:random.randint(10, 15)]\n",
    "keywords_rake"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.25 ms, sys: 0 ns, total: 3.25 ms\n",
      "Wall time: 3 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['mestilah ingat bahawa disebabkan nila setitik maka rosaklah susu belanga',\n",
       " 'semua pihak selain ibu bapa perlu berganding bahu bagai aur',\n",
       " 'ingin berehat untuk melepaskan lelah dan tidak mahu diganggu',\n",
       " 'masalah disiplin kerana jikalau tiada angin masakan pokok bergoyang',\n",
       " 'menyebabkan disiplin pelajar semakin membimbangkan seperti pengaruh rakan sebaya',\n",
       " 'ibu bapa perlu menjaga tingkah laku sendiri supaya anak',\n",
       " 'hari secara mendadak bagai cendawam tumbuh selepas hujan',\n",
       " 'Pepatah melayu mengatakan bahawa seekor kerbau membawa lumpur',\n",
       " 'berlaku apabila ibu bapa tidak dapat meluangkan masa bersama-sama',\n",
       " 'mengabaikan pelajaran dan hilang rasa minat untuk belajar',\n",
       " 'kalangan pelajar perlu dibendung seperti kata pepatah Melayu',\n",
       " 'terjejas dan menganggap bahawa keluarganya tidak selamat']"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# rake candidates for the essay at index 1, using the stopwords3 list.\n",
    "ranked = malaya.keyword.extractive.rake(after[1], top_k = 100, stopwords = stopwords3)\n",
    "# Keep phrases (element [1] of each candidate) with more than 3 words and more than 10 characters.\n",
    "phrases = [\n",
    "    simple_cleaning(item[1])\n",
    "    for item in ranked\n",
    "    if len(item[1].split()) > 3 and len(item[1]) > 10\n",
    "]\n",
    "# Show a prefix whose length is drawn from random.randint(10, 15).\n",
    "keywords_rake = phrases[:random.randint(10, 15)]\n",
    "keywords_rake"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8.94 ms, sys: 253 µs, total: 9.19 ms\n",
      "Wall time: 8.27 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['Penduduk setempat mestilah membentuk satu persatuan',\n",
       " 'aktiviti hari keluarga diadakan',\n",
       " 'Melalui majlis rumah terbuka',\n",
       " 'melalui majlis rumah terbuka',\n",
       " 'digambarkan melalui pepatah Melayu',\n",
       " 'majlis rumah terbuka mampu mendekatkan ahli masyarakat',\n",
       " 'melalui Pertubuhan Rukun Tetangga',\n",
       " 'sebagainya majlis-majlis rumah terbuka',\n",
       " 'majlis-majlis rumah terbuka semasa',\n",
       " 'meningkatkan hubungan sesama jiran',\n",
       " 'mengenal hati budi jiran',\n",
       " 'prinsip cubit paha kiri',\n",
       " 'perlulah mengukuhkan semangat kejiranan']"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# textrank candidates for the essay at index 2, using the tfidf vectorizer and the full stopword list.\n",
    "ranked = malaya.keyword.extractive.textrank(after[2], model = tfidf, top_k = 100, stopwords = stopwords)\n",
    "# Keep phrases (element [1] of each candidate) with more than 3 words and more than 10 characters.\n",
    "phrases = [\n",
    "    simple_cleaning(item[1])\n",
    "    for item in ranked\n",
    "    if len(item[1].split()) > 3 and len(item[1]) > 10\n",
    "]\n",
    "# Show a prefix whose length is drawn from random.randint(10, 15).\n",
    "# The result is stored under the name keywords_rake although it comes from textrank.\n",
    "keywords_rake = phrases[:random.randint(10, 15)]\n",
    "keywords_rake"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 11.9 ms, sys: 5.17 ms, total: 17 ms\n",
      "Wall time: 13.2 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['persatuan penduduk atau pihak swasta boleh mengadakan aktiviti Hari Keluarga untuk memupuk semangat kejiranan dalam kalangan masyarakat setempat',\n",
       " 'jiran-jiran dapat berkumpul dan bekerjasama sambil berbincang tentang aktiviti',\n",
       " 'dapat membentuk ikatan silaturahim dan memperkukuh semangat kejiranan dalam kalangan jiran',\n",
       " 'pula hendaklah menyemarakkan budaya kunjung-mengunjungi dalam kalangan penduduk sebagai langkah untuk memupuk dan mengekalkan semangat kejiranan',\n",
       " 'semua ahli keluarga jiran akan turut serta untuk melibatkan diri dalam aktiviti seperti berkayak',\n",
       " 'wajar hendaklah diambil untuk memupuk semangat kejiranan dalam kalangan penduduk setempat',\n",
       " 'perlu memupuk semangat kejiranan dalam kalangan penduduk setempat',\n",
       " 'semangat kejiranan akan luntur dan seterusnya akan menjejaskan perpaduan negara',\n",
       " 'dapat menjalinkan dan meningkatkan hubungan sesama jiran',\n",
       " 'akan dikenali sebagai jiran tetangga',\n",
       " 'harus menggalakkan rakyat supaya melibatkan diri dalam aktiviti sosial seperti persatuan atau pertubuhan']"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# textrank candidates for the essay at index 2, using the tfidf vectorizer and the short stopwords2 list.\n",
    "ranked = malaya.keyword.extractive.textrank(after[2], model = tfidf, top_k = 100, stopwords = stopwords2)\n",
    "# Keep phrases (element [1] of each candidate) with more than 3 words and more than 10 characters.\n",
    "phrases = [\n",
    "    simple_cleaning(item[1])\n",
    "    for item in ranked\n",
    "    if len(item[1].split()) > 3 and len(item[1]) > 10\n",
    "]\n",
    "# Show a prefix whose length is drawn from random.randint(10, 15).\n",
    "# The result is stored under the name keywords_rake although it comes from textrank.\n",
    "keywords_rake = phrases[:random.randint(10, 15)]\n",
    "keywords_rake"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.92 ms, sys: 2.31 ms, total: 6.23 ms\n",
      "Wall time: 4.55 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['jiran-jiran dapat berkumpul dan bekerjasama sambil berbincang tentang aktiviti',\n",
       " 'langkah untuk memupuk dan mengekalkan semangat kejiranan',\n",
       " 'pihak swasta boleh mengadakan aktiviti Hari Keluarga untuk memupuk semangat kejiranan',\n",
       " 'dapat menjalinkan dan meningkatkan hubungan sesama jiran',\n",
       " 'dapat membentuk ikatan silaturahim dan memperkukuh semangat kejiranan',\n",
       " 'harus diambil ialah penduduk setempat hendaklah mengadakan aktiviti gotong-royong',\n",
       " 'dapat digambarkan melalui pepatah Melayu',\n",
       " 'Apabila aktiviti hari keluarga diadakan',\n",
       " 'majlis rumah terbuka mampu mendekatkan ahli masyarakat untuk bertemu muka dan berkenalan',\n",
       " 'semua ahli keluarga jiran',\n",
       " 'Penduduk setempat mestilah membentuk satu persatuan']"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# textrank candidates for the essay at index 2, using the tfidf vectorizer and the stopwords3 list.\n",
    "ranked = malaya.keyword.extractive.textrank(after[2], model = tfidf, top_k = 100, stopwords = stopwords3)\n",
    "# Keep phrases (element [1] of each candidate) with more than 3 words and more than 10 characters.\n",
    "phrases = [\n",
    "    simple_cleaning(item[1])\n",
    "    for item in ranked\n",
    "    if len(item[1].split()) > 3 and len(item[1]) > 10\n",
    "]\n",
    "# Show a prefix whose length is drawn from random.randint(10, 15).\n",
    "# The result is stored under the name keywords_rake although it comes from textrank.\n",
    "keywords_rake = phrases[:random.randint(10, 15)]\n",
    "keywords_rake"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The three stopword lists; the augmentation loop in the next cell tries each of them for every essay.\n",
    "l_stopwords = [stopwords, stopwords2, stopwords3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████| 367/367 [00:06<00:00, 55.07it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "new_before = []\n",
    "new_after = []\n",
    "\n",
    "minimum = 7         # fewest phrases an augmented example may have\n",
    "maximum = 15        # upper bound for the random prefix length\n",
    "n_candidates = 100  # top_k requested from each extractor\n",
    "\n",
    "# Fix the RNG state so re-running this cell rebuilds the same augmented set.\n",
    "random.seed(1234)\n",
    "\n",
    "\n",
    "def _add_example(candidates, essay):\n",
    "    \"\"\"Filter extractor output and, if enough phrases survive, record one augmented pair.\n",
    "\n",
    "    candidates: extractor results; element [1] of each item is the phrase text.\n",
    "    essay: the text the candidates were extracted from.\n",
    "\n",
    "    Phrases need more than 3 words and more than 10 characters. A prefix of random\n",
    "    length in [minimum, maximum] is kept; the pair is appended to new_before /\n",
    "    new_after only when at least `minimum` phrases remain.\n",
    "    \"\"\"\n",
    "    phrases = [\n",
    "        simple_cleaning(c[1])\n",
    "        for c in candidates\n",
    "        if len(c[1].split()) > 3 and len(c[1]) > 10\n",
    "    ]\n",
    "    phrases = phrases[:random.randint(minimum, maximum)]\n",
    "    if len(phrases) >= minimum:\n",
    "        new_before.append(phrases)\n",
    "        new_after.append(essay)\n",
    "\n",
    "\n",
    "for essay in tqdm(after):\n",
    "    # rake over every stopword list first, then textrank over every stopword list.\n",
    "    for stop_list in l_stopwords:\n",
    "        _add_example(\n",
    "            malaya.keyword.extractive.rake(essay, top_k = n_candidates, stopwords = stop_list),\n",
    "            essay,\n",
    "        )\n",
    "    for stop_list in l_stopwords:\n",
    "        _add_example(\n",
    "            malaya.keyword.extractive.textrank(essay, model = tfidf, top_k = n_candidates, stopwords = stop_list),\n",
    "            essay,\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2533"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total number of examples: collected pairs plus the augmented ones.\n",
    "len(before + new_before)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collected pairs first, augmented pairs after them, stored under the keys 'before' and 'after'.\n",
    "with open('karangan.json', 'w') as fopen:\n",
    "    payload = {'before': before + new_before, 'after': after + new_after}\n",
    "    json.dump(payload, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
